################################################################################
#
#                       美国民粹主义外交政策的民意基础                       
#
#                            付 舒    2025年10月     
#
################################################################################


#####（三）跨问卷插补的交叉检验  ######

### Hold-out cross-validation: random 50/50 split of anes2020_popu
# The seed is set once here; the split below and any later statement that
# draws random numbers depend on it, so their order matters for reproducibility.
set.seed(3573)

# Flag each row as training (1) or test (0), each with probability 0.5.
anes2020_popu_cv <- mutate(
  anes2020_popu,
  training = if_else(runif(n()) < 0.5, 1, 0)
)

# Split the flagged data into the two halves.
anes2020_popu_train <- anes2020_popu_cv[anes2020_popu_cv[["training"]] == 1, ]
anes2020_popu_test <- anes2020_popu_cv[anes2020_popu_cv[["training"]] == 0, ]


## Model 1: additive OLS, fitted on the training half and used to predict the
## test half, once per populism dimension (political, economic, cultural).
#
# All three outcomes share ONE covariate vector. It is the same covariate set
# that the interaction and random-forest models in this section use, so the
# hold-out comparison between models is like for like. The political-dimension
# model previously left out cv_race_black; building every formula from the
# shared vector removes that inconsistency and prevents it from recurring.
lm_covariates_cv <- c(
  "cv_vote2020trump", "cv_ideology", "cv_party",
  "cv_religion_catho", "cv_religion_chris",
  "cv_houseowner", "cv_female", "cv_age", "cv_marital_mar",
  "cv_race_white", "cv_race_black", "cv_race_latin",
  "cv_education", "cv_income"
)

# Political dimension.
lm_model_poli_cv <- lm(
  reformulate(lm_covariates_cv, response = "popu_poli"),
  data = anes2020_popu_train
)
poli_pred_lm_cv <- predict(lm_model_poli_cv, newdata = anes2020_popu_test)

# Economic dimension.
lm_model_econ_cv <- lm(
  reformulate(lm_covariates_cv, response = "popu_econ"),
  data = anes2020_popu_train
)
econ_pred_lm_cv <- predict(lm_model_econ_cv, newdata = anes2020_popu_test)

# Cultural dimension.
lm_model_cult_cv <- lm(
  reformulate(lm_covariates_cv, response = "popu_cult"),
  data = anes2020_popu_train
)
cult_pred_lm_cv <- predict(lm_model_cult_cv, newdata = anes2020_popu_test)



# Model 2: the "fully saturated" specification as implemented here. The `^ 2`
# expands to every main effect plus every pairwise interaction of the
# covariates; no three-way or higher-order terms are included.

# Fit on the training half, one model per populism dimension.
fs_model_poli_cv <- lm(
  formula = popu_poli ~ (
    cv_vote2020trump + cv_ideology + cv_party +
      cv_religion_catho + cv_religion_chris +
      cv_houseowner + cv_female + cv_age + cv_marital_mar +
      cv_race_white + cv_race_black + cv_race_latin +
      cv_education + cv_income
  )^2,
  data = anes2020_popu_train
)

fs_model_econ_cv <- lm(
  formula = popu_econ ~ (
    cv_vote2020trump + cv_ideology + cv_party +
      cv_religion_catho + cv_religion_chris +
      cv_houseowner + cv_female + cv_age + cv_marital_mar +
      cv_race_white + cv_race_black + cv_race_latin +
      cv_education + cv_income
  )^2,
  data = anes2020_popu_train
)

fs_model_cult_cv <- lm(
  formula = popu_cult ~ (
    cv_vote2020trump + cv_ideology + cv_party +
      cv_religion_catho + cv_religion_chris +
      cv_houseowner + cv_female + cv_age + cv_marital_mar +
      cv_race_white + cv_race_black + cv_race_latin +
      cv_education + cv_income
  )^2,
  data = anes2020_popu_train
)

# Predict the held-out test half with each fitted model.
poli_pred_fs_cv <- predict(object = fs_model_poli_cv, newdata = anes2020_popu_test)
econ_pred_fs_cv <- predict(object = fs_model_econ_cv, newdata = anes2020_popu_test)
cult_pred_fs_cv <- predict(object = fs_model_cult_cv, newdata = anes2020_popu_test)




# Model 3: random forest, one forest per populism dimension, using the same
# covariates as the interaction models above. Each forest is fitted on the
# training half with 100 trees (ntree = 100) and then used to predict the
# test half.
#
# randomForest() draws random numbers, so the fitted forests depend on the
# seed set at the start of this section AND on the order of the statements
# below; do not reorder them if results must be reproduced.
#
# NOTE(review): with the formula interface, randomForest()'s default na.action
# is na.fail, so these calls presumably rely on the training half having no
# missing values in the outcomes and covariates -- confirm against the data
# preparation step.

# Political dimension.
rf_model_poli_cv <- randomForest(popu_poli ~  cv_vote2020trump + cv_ideology + cv_party +    
                                cv_religion_catho + cv_religion_chris + 
                                cv_houseowner + cv_female + cv_age + cv_marital_mar +
                                cv_race_white + cv_race_black + cv_race_latin +
                                cv_education + cv_income, 
                                data = anes2020_popu_train, ntree = 100)
poli_pred_rf_cv <- predict(rf_model_poli_cv, newdata = anes2020_popu_test)

# Economic dimension.
rf_model_econ_cv <- randomForest(popu_econ ~  cv_vote2020trump + cv_ideology + cv_party +    
                                cv_religion_catho + cv_religion_chris + 
                                cv_houseowner + cv_female + cv_age + cv_marital_mar +
                                cv_race_white + cv_race_black + cv_race_latin +
                                cv_education + cv_income, 
                                data = anes2020_popu_train, ntree = 100)
econ_pred_rf_cv <- predict(rf_model_econ_cv, newdata = anes2020_popu_test)

# Cultural dimension.
rf_model_cult_cv <- randomForest(popu_cult ~  cv_vote2020trump + cv_ideology + cv_party +     
                                cv_religion_catho + cv_religion_chris + 
                                cv_houseowner + cv_female + cv_age + cv_marital_mar +
                                cv_race_white + cv_race_black + cv_race_latin +
                                cv_education + cv_income, 
                                data = anes2020_popu_train, ntree = 100)
cult_pred_rf_cv <- predict(rf_model_cult_cv, newdata = anes2020_popu_test)


# Attach the nine hold-out prediction vectors to the test set. Each new column
# takes the name of the vector it stores; columns are added in the order
# lm, fs, rf and, within each model, poli, econ, cult.
anes2020_popu_test <- mutate(
  anes2020_popu_test,
  poli_pred_lm_cv = poli_pred_lm_cv,
  econ_pred_lm_cv = econ_pred_lm_cv,
  cult_pred_lm_cv = cult_pred_lm_cv,
  poli_pred_fs_cv = poli_pred_fs_cv,
  econ_pred_fs_cv = econ_pred_fs_cv,
  cult_pred_fs_cv = cult_pred_fs_cv,
  poli_pred_rf_cv = poli_pred_rf_cv,
  econ_pred_rf_cv = econ_pred_rf_cv,
  cult_pred_rf_cv = cult_pred_rf_cv
)


# Summarise hold-out prediction accuracy of the three models for one dimension.
#
# The same pipeline used to be written out three times, once per dimension;
# it now lives in one helper so the three tables cannot drift apart. The call
# to factor() also spells out `levels =` instead of relying on partial
# matching of `level`.
#
# @param data Test-set data frame containing the observed outcome
#   `popu_<dimension>` and the prediction columns
#   `<dimension>_pred_lm_cv`, `<dimension>_pred_fs_cv`, `<dimension>_pred_rf_cv`.
# @param dimension Dimension prefix as a single string: "poli", "econ" or
#   "cult".
# @return A tibble with one row per model and the columns `model` (factor,
#   ordered lm, fs, rf), `RMSE`, `MAE`, `Cor` (Pearson correlation between
#   observed and predicted values) and `N` (rows used for that model).
cv_accuracy_tab <- function(data, dimension) {
  outcome_col <- paste0("popu_", dimension)
  # Model code -> display label; the order here is the row order of the table.
  model_labels <- c(lm = "最小二乘", fs = "完全饱和", rf = "随机森林")
  pred_cols <- paste0(dimension, "_pred_", names(model_labels), "_cv")

  data %>%
    # Keep only what is needed and give the outcome a fixed name, so the
    # metric expressions below do not depend on the dimension.
    dplyr::select(observed = all_of(outcome_col), all_of(pred_cols)) %>%
    pivot_longer(
      all_of(pred_cols),
      names_to = "model",
      names_pattern = ".*_pred_(.*)_cv",
      values_to = "yhat"
    ) %>%
    mutate(
      model = factor(unname(model_labels[model]), levels = unname(model_labels))
    ) %>%
    # Rows with a missing outcome or a missing prediction are excluded per
    # model, so N can differ between models.
    drop_na(observed, yhat) %>%
    group_by(model) %>%
    summarise(
      RMSE = sqrt(mean((observed - yhat)^2)),
      MAE  = mean(abs(observed - yhat)),
      Cor  = cor(observed, yhat),
      N    = n()
    )
}

poli_tab <- cv_accuracy_tab(anes2020_popu_test, "poli")
econ_tab <- cv_accuracy_tab(anes2020_popu_test, "econ")
cult_tab <- cv_accuracy_tab(anes2020_popu_test, "cult")

# Table 1
poli_tab
econ_tab
cult_tab


